{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "afc43f0f",
   "metadata": {},
   "source": [
    "### What this script does\n",
    "\n",
    "- Loads KNMI wind data (.txt) and 10-min merged mast/sonic/radiometer data (or any dataset containing wind direction) (merged_data_10min.csv files).\n",
    "\n",
    "- Filters KNMI data for Feb 28 – Jun 13, 2024 and plots wind direction vs. time.\n",
    "\n",
    "- Loads and combines all 10-min datasets from multiple monthly folders.\n",
    "\n",
    "- Compares wind direction (KNMI vs Sonic) through time-series and correlation plots (Note: it is not wind direction measured by the sonic but wind direction measured through the wind vanes on the mast at a certain height).\n",
    "\n",
    "- Computes Pearson and Spearman correlations between the two datasets.\n",
    "\n",
    "- Calculates and visualizes the wind direction offset (Δθ = KNMI − Sonic) and applies a mean correction.\n",
    "\n",
    "#### Edit before running\n",
    "- Path to your KNMI dataset:\n",
    "\n",
    "file_path = r\"C:\\path\\to\\your\\KNMI\\uurgeg_235_2021-2030.txt\"\n",
    "\n",
    "- Base directory containing the monthly data folders\n",
    "\n",
    "base_dir = r\"C:\\path\\to\\your\\Sonic\"\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cd7f089e",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import os\n",
    "import matplotlib.pyplot as plt\n",
    "import scipy.stats as stats\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bbbcd5e9",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Path to your KNMI dataset\n",
    "# ⚠️ Edit this path to match your local setup before running.\n",
    "# Example: file_path = r\"D:\\Thesis\\data\\KNMI\\uurgeg_235_2021-2030.txt\"\n",
    "file_path = r\"C:\\path\\to\\your\\KNMI\\uurgeg_235_2021-2030.txt\"\n",
    "\n",
    "\n",
    "## Load the dataset\n",
    "try:\n",
    "    with open(file_path, 'r', encoding='utf-8') as file:\n",
    "        lines = file.readlines()\n",
    "    \n",
    "    # Identify the correct header row manually\n",
    "    header_row = 31  # Adjust based on inspection of metadata\n",
    "    \n",
    "    # Print metadata before the header row separately\n",
    "    metadata = lines[:header_row]\n",
    "    print(\"Metadata before the header row:\")\n",
    "    for line in metadata:\n",
    "        print(line.strip())\n",
    "    \n",
    "    print(f\"\\nUsing line {header_row} as header row.\")\n",
    "    \n",
    "    # Read the data with proper headers\n",
    "    df = pd.read_csv(file_path, skiprows=header_row, sep=\",\", engine='python', on_bad_lines='skip')\n",
    "    \n",
    "    # Ensure column names are properly formatted\n",
    "    df.columns = df.columns.str.strip()\n",
    "    \n",
    "    # Print detected column names\n",
    "    print(\"Detected Columns:\", df.columns.tolist())\n",
    "    \n",
    "    # Ensure the correct date column is used\n",
    "    if 'YYYYMMDD' in df.columns and 'HH' in df.columns:\n",
    "        df['Timestamp'] = pd.to_datetime(df['YYYYMMDD'].astype(str) + df['HH'].astype(str).str.zfill(2), format='%Y%m%d%H', errors='coerce')\n",
    "        \n",
    "        # Filter data for February 28, 2024, to June 13, 2024\n",
    "        start_date = '2024-02-28'\n",
    "        end_date = '2024-06-13'\n",
    "        df_filtered = df[(df['Timestamp'] >= start_date) & (df['Timestamp'] <= end_date)]\n",
    "        \n",
    "        # Plot Wind Direction vs Time for the filtered dataset\n",
    "        if 'DD' in df_filtered.columns:\n",
    "            plt.figure(figsize=(12, 6))\n",
    "            plt.scatter(df_filtered['Timestamp'], df_filtered['DD'], alpha=0.5, label='Wind Direction (Degrees)')\n",
    "            plt.xlabel('Time')\n",
    "            plt.ylabel('Wind Direction (Degrees)')\n",
    "            plt.title('Wind Direction vs Time (Feb 28 - Jun 13, 2024)')\n",
    "            plt.legend()\n",
    "            plt.xticks(rotation=45)\n",
    "            plt.grid()\n",
    "            plt.show()\n",
    "        else:\n",
    "            print(\"Error: No wind direction (DD) column found in dataset.\")\n",
    "    else:\n",
    "        print(\"Error: No valid date column found. Check the dataset format.\")\n",
    "    \n",
    "except FileNotFoundError:\n",
    "    print(f\"Error: The file '{file_path}' was not found.\")\n",
    "except Exception as e:\n",
    "    print(f\"An error occurred: {e}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4d896239",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    " print(df_filtered)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "26418cab",
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "\n",
    "# Base directory containing the monthly 10-min merged sonic/mast/radiometer data folders\n",
    "# ⚠️ Edit this path to match your local setup before running.\n",
    "# Example: base_dir = r\"D:\\Thesis\\data\\Sonic\"\n",
    "base_dir = r\"C:\\path\\to\\your\\Sonic\"\n",
    "\n",
    "\n",
    "# List of months to process\n",
    "months = ['2024-03', '2024-04', '2024-05']#, '2024-06']\n",
    "\n",
    "# Initialize an empty list to store data\n",
    "all_data = []\n",
    "\n",
    "# Loop through each month's folder\n",
    "for month in months:\n",
    "    month_dir = os.path.join(base_dir, month)\n",
    "    \n",
    "    # Check if the monthly directory exists\n",
    "    if not os.path.exists(month_dir):\n",
    "        print(f\"Directory does not exist: {month_dir}\")\n",
    "        continue\n",
    "    \n",
    "    # Loop through each day's folder in the month\n",
    "    for day in os.listdir(month_dir):\n",
    "        day_dir = os.path.join(month_dir, day)\n",
    "        \n",
    "        # Check if it's a directory\n",
    "        if not os.path.isdir(day_dir):\n",
    "            continue\n",
    "        \n",
    "        # Define the path to the 'merged_data_10min.csv' file\n",
    "        file_path = os.path.join(day_dir, 'merged_data_10min.csv')\n",
    "        \n",
    "        # Check if the file exists\n",
    "        if os.path.exists(file_path):\n",
    "            # Load the data and append it to the list\n",
    "            try:\n",
    "                data = pd.read_csv(file_path)\n",
    "                all_data.append(data)\n",
    "                print(f\"Loaded data from: {file_path}\")\n",
    "            except Exception as e:\n",
    "                print(f\"Error loading file {file_path}: {e}\")\n",
    "        else:\n",
    "            print(f\"File does not exist: {file_path}\")\n",
    "\n",
    "# Combine all the data into one DataFrame\n",
    "if all_data:\n",
    "    combined_data = pd.concat(all_data, ignore_index=True)\n",
    "    print(\"Successfully combined all data.\")\n",
    "else:\n",
    "    combined_data = pd.DataFrame()  # Empty DataFrame if no data found\n",
    "    print(\"No data files found.\")\n",
    "\n",
    "# Output the first few rows of the combined DataFrame\n",
    "print(combined_data.head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "72fe13d6",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Plot Wind Direction vs Time for KNMI and Sonic data\n",
    "plt.figure(figsize=(12, 6))\n",
    "\n",
    "# KNMI Wind Direction\n",
    "if 'Timestamp' in df_filtered.columns and 'DD' in df_filtered.columns:\n",
    "    plt.scatter(df_filtered['Timestamp'], df_filtered['DD'], alpha=0.5, label='KNMI Wind Direction (Degrees)', color='blue')\n",
    "\n",
    "# Mast Wind Direction\n",
    "if 'TIMESTAMP' in combined_data.columns and 'WindDir_D15008_Avg' in combined_data.columns:\n",
    "    plt.scatter(combined_data['TIMESTAMP'], combined_data['WindDir_D15008_Avg'], alpha=0.5, label='Sonic Wind Direction (Degrees)', color='red')\n",
    "\n",
    "# Finalize plot\n",
    "plt.xlabel('Time')\n",
    "plt.ylabel('Wind Direction (Degrees)')\n",
    "plt.title('Wind Direction Comparison: KNMI vs Sonic (Feb 28 - Jun 13, 2024)')\n",
    "plt.legend()\n",
    "plt.xticks(rotation=45)\n",
    "plt.grid()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "344f6533",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(combined_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "85b5c861",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Resample Sonic data to hourly means\n",
    "combined_data['TIMESTAMP'] = pd.to_datetime(combined_data['TIMESTAMP'], errors='coerce')\n",
    "combined_hourly = combined_data.resample('H', on='TIMESTAMP').mean().reset_index()\n",
    "combined_hourly['Timestamp']=combined_hourly['TIMESTAMP']\n",
    "print(combined_hourly)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5feb3622",
   "metadata": {},
   "outputs": [],
   "source": [
    "#\n",
    "# Merge the datasets on the nearest hourly timestamp\n",
    "merged_df = pd.merge_asof(df_filtered.sort_values('Timestamp'), combined_hourly.sort_values('Timestamp'), on='Timestamp', direction='nearest')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4bdd5a35",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "# Scatter plot to compare KNMI and Sonic wind direction after resampling\n",
    "plt.figure(figsize=(8, 6))\n",
    "if 'DD' in merged_df.columns and 'WindDir_D15008_Avg' in merged_df.columns:\n",
    "    plt.scatter(merged_df['DD'], merged_df['WindDir_D15008_Avg'], alpha=0.5, color='blue', s=10)\n",
    "    plt.xlabel('KNMI Wind Direction (Degrees)')\n",
    "    plt.ylabel('Sonic Wind Direction (Degrees)')\n",
    "    plt.title('Correlation between KNMI and Sonic Wind Direction (Hourly)')\n",
    "    plt.xlim(0,370)\n",
    "    plt.ylim(0,370)\n",
    "    plt.grid()\n",
    "    plt.show()\n",
    "else:\n",
    "    print(\"Error: Required columns not found in merged dataset.\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4888243f",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Drop NaN values from both columns before computing correlation\n",
    "merged_df_clean = merged_df.dropna(subset=['DD', 'WindDir_D15008_Avg'])\n",
    "\n",
    "# Compute correlation metrics\n",
    "if not merged_df_clean.empty:\n",
    "    pearson_corr, pearson_p = stats.pearsonr(merged_df_clean['DD'], merged_df_clean['WindDir_D15008_Avg'])\n",
    "    spearman_corr, spearman_p = stats.spearmanr(merged_df_clean['DD'], merged_df_clean['WindDir_D15008_Avg'])\n",
    "    \n",
    "    print(f\"Pearson Correlation: {pearson_corr:.3f} (p-value: {pearson_p:.3f})\")\n",
    "    print(f\"Spearman Correlation: {spearman_corr:.3f} (p-value: {spearman_p:.3f})\")\n",
    "\n",
    "    # Scatter plot to compare KNMI and Sonic wind direction after resampling\n",
    "    plt.figure(figsize=(8, 6))\n",
    "    plt.scatter(merged_df_clean['DD'], merged_df_clean['WindDir_D15008_Avg'], alpha=0.5, color='blue', s=10)\n",
    "    plt.xlabel('KNMI Wind Direction (Degrees)')\n",
    "    plt.ylabel('Sonic Wind Direction (Degrees)')\n",
    "    plt.title(f'Correlation between KNMI and Sonic Wind Direction (Hourly)\\nPearson: {pearson_corr:.3f}, Spearman: {spearman_corr:.3f}')\n",
    "    plt.grid()\n",
    "    plt.show()\n",
    "else:\n",
    "    print(\"Error: No valid data available after dropping NaN values.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9c5d8269",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "# Make sure you already have merged_df from the previous steps\n",
    "# merged_df should contain both 'DD' (KNMI) and 'WindDir_D15008_Avg' (Sonic) columns\n",
    "\n",
    "# Drop NaNs\n",
    "merged_df_clean = merged_df.dropna(subset=['DD', 'WindDir_D15008_Avg'])\n",
    "\n",
    "# Compute raw difference\n",
    "merged_df_clean['delta_theta'] = merged_df_clean['DD'] - merged_df_clean['WindDir_D15008_Avg']\n",
    "\n",
    "# Adjust difference to be in the range [-180, 180]\n",
    "merged_df_clean['delta_theta'] = (merged_df_clean['delta_theta'] + 180) % 360 - 180\n",
    "\n",
    "# Plot histogram to visualize the distribution of delta_theta\n",
    "plt.figure(figsize=(10, 5))\n",
    "plt.hist(merged_df_clean['delta_theta'], bins=50, color='steelblue', edgecolor='black')\n",
    "plt.title('Histogram of Wind Direction Offset (Δθ = KNMI - Sonic)', fontsize=14)\n",
    "plt.xlabel('Δθ (Degrees)', fontsize=12)\n",
    "plt.ylabel('Frequency', fontsize=12)\n",
    "plt.grid(True)\n",
    "plt.tight_layout()\n",
    "plt.show()\n",
    "\n",
    "# Calculate mean and standard deviation of the offset\n",
    "mean_offset = merged_df_clean['delta_theta'].mean()\n",
    "std_offset = merged_df_clean['delta_theta'].std()\n",
    "\n",
    "print(f\"Mean Δθ (KNMI - Sonic): {mean_offset:.2f}°\")\n",
    "print(f\"Standard Deviation of Δθ: {std_offset:.2f}°\")\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e6869168",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Apply correction to Sonic wind direction\n",
    "merged_df_clean['WindDir_corrected'] = (merged_df_clean['WindDir_D15008_Avg'] + mean_offset) % 360\n",
    "\n",
    "# Scatter plot after correction\n",
    "plt.figure(figsize=(8, 6))\n",
    "plt.scatter(merged_df_clean['DD'], merged_df_clean['WindDir_corrected'], alpha=0.5, color='seagreen', s=10)\n",
    "plt.xlabel('KNMI Wind Direction (Degrees)')\n",
    "plt.ylabel('Corrected Sonic Wind Direction (Degrees)')\n",
    "plt.title(f'Corrected Wind Direction vs. KNMI\\nUsing Mean Offset Δθ = {mean_offset:.2f}°')\n",
    "plt.xlim(0, 360)\n",
    "plt.ylim(0, 360)\n",
    "plt.grid()\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
